In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")
In [20]:
pd.set_option('display.max_rows',100)
pd.set_option('display.max_columns',100)

Reading DataSet

In [21]:
df = pd.read_excel('HR_Employee_Attrition Dataset.xlsx')
In [22]:
df.head()
Out[22]:
EmployeeNumber Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 Yes 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 2 Female 94 3 2 Sales Executive 4 Single 5993 19479 8 Y Yes 11 3 1 80 0 8 0 1 6 4 0 5
1 2 No 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 3 Male 61 2 2 Research Scientist 2 Married 5130 24907 1 Y No 23 4 4 80 1 10 3 3 10 7 1 7
2 3 Yes 37 Travel_Rarely 1373 Research & Development 2 2 Other 4 Male 92 2 1 Laboratory Technician 3 Single 2090 2396 6 Y Yes 15 3 2 80 0 7 3 3 0 0 0 0
3 4 No 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 4 Female 56 3 1 Research Scientist 3 Married 2909 23159 1 Y Yes 11 3 3 80 0 8 3 3 8 7 3 0
4 5 No 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 Male 40 3 1 Laboratory Technician 2 Married 3468 16632 9 Y No 12 3 4 80 1 6 3 3 2 2 2 2
In [23]:
df.shape
Out[23]:
(2940, 34)
In [24]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2940 entries, 0 to 2939
Data columns (total 34 columns):
EmployeeNumber              2940 non-null int64
Attrition                   2940 non-null object
Age                         2940 non-null int64
BusinessTravel              2940 non-null object
DailyRate                   2940 non-null int64
Department                  2940 non-null object
DistanceFromHome            2940 non-null int64
Education                   2940 non-null int64
EducationField              2940 non-null object
EnvironmentSatisfaction     2940 non-null int64
Gender                      2940 non-null object
HourlyRate                  2940 non-null int64
JobInvolvement              2940 non-null int64
JobLevel                    2940 non-null int64
JobRole                     2940 non-null object
JobSatisfaction             2940 non-null int64
MaritalStatus               2940 non-null object
MonthlyIncome               2940 non-null int64
MonthlyRate                 2940 non-null int64
NumCompaniesWorked          2940 non-null int64
Over18                      2940 non-null object
OverTime                    2940 non-null object
PercentSalaryHike           2940 non-null int64
PerformanceRating           2940 non-null int64
RelationshipSatisfaction    2940 non-null int64
StandardHours               2940 non-null int64
StockOptionLevel            2940 non-null int64
TotalWorkingYears           2940 non-null int64
TrainingTimesLastYear       2940 non-null int64
WorkLifeBalance             2940 non-null int64
YearsAtCompany              2940 non-null int64
YearsInCurrentRole          2940 non-null int64
YearsSinceLastPromotion     2940 non-null int64
YearsWithCurrManager        2940 non-null int64
dtypes: int64(25), object(9)
memory usage: 781.1+ KB

Attrition is around 16%

In [25]:
df.Attrition.value_counts(normalize=True)
Out[25]:
No     0.838776
Yes    0.161224
Name: Attrition, dtype: float64

Inspecting for Nulls

In [26]:
df.isnull().sum()
Out[26]:
EmployeeNumber              0
Attrition                   0
Age                         0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64
In [27]:
df.JobInvolvement.value_counts()
Out[27]:
3    1736
2     750
4     288
1     166
Name: JobInvolvement, dtype: int64
In [28]:
df.describe().transpose()
Out[28]:
count mean std min 25% 50% 75% max
EmployeeNumber 2940.0 1470.500000 848.849221 1.0 735.75 1470.5 2205.25 2940.0
Age 2940.0 36.923810 9.133819 18.0 30.00 36.0 43.00 60.0
DailyRate 2940.0 802.485714 403.440447 102.0 465.00 802.0 1157.00 1499.0
DistanceFromHome 2940.0 9.192517 8.105485 1.0 2.00 7.0 14.00 29.0
Education 2940.0 2.912925 1.023991 1.0 2.00 3.0 4.00 5.0
EnvironmentSatisfaction 2940.0 2.721769 1.092896 1.0 2.00 3.0 4.00 4.0
HourlyRate 2940.0 65.891156 20.325969 30.0 48.00 66.0 84.00 100.0
JobInvolvement 2940.0 2.729932 0.711440 1.0 2.00 3.0 3.00 4.0
JobLevel 2940.0 2.063946 1.106752 1.0 1.00 2.0 3.00 5.0
JobSatisfaction 2940.0 2.728571 1.102658 1.0 2.00 3.0 4.00 4.0
MonthlyIncome 2940.0 6502.931293 4707.155770 1009.0 2911.00 4919.0 8380.00 19999.0
MonthlyRate 2940.0 14313.103401 7116.575021 2094.0 8045.00 14235.5 20462.00 26999.0
NumCompaniesWorked 2940.0 2.693197 2.497584 0.0 1.00 2.0 4.00 9.0
PercentSalaryHike 2940.0 15.209524 3.659315 11.0 12.00 14.0 18.00 25.0
PerformanceRating 2940.0 3.153741 0.360762 3.0 3.00 3.0 3.00 4.0
RelationshipSatisfaction 2940.0 2.712245 1.081025 1.0 2.00 3.0 4.00 4.0
StandardHours 2940.0 80.000000 0.000000 80.0 80.00 80.0 80.00 80.0
StockOptionLevel 2940.0 0.793878 0.851932 0.0 0.00 1.0 1.00 3.0
TotalWorkingYears 2940.0 11.279592 7.779458 0.0 6.00 10.0 15.00 40.0
TrainingTimesLastYear 2940.0 2.799320 1.289051 0.0 2.00 3.0 3.00 6.0
WorkLifeBalance 2940.0 2.761224 0.706356 1.0 2.00 3.0 3.00 4.0
YearsAtCompany 2940.0 7.008163 6.125483 0.0 3.00 5.0 9.00 40.0
YearsInCurrentRole 2940.0 4.229252 3.622521 0.0 2.00 3.0 7.00 18.0
YearsSinceLastPromotion 2940.0 2.187755 3.221882 0.0 0.00 1.0 3.00 15.0
YearsWithCurrManager 2940.0 4.123129 3.567529 0.0 2.00 3.0 7.00 17.0

Univariate Analysis

In [29]:
sns.set_style("darkgrid")
f, axes = plt.subplots(2,4,figsize=(30,15))
sns.distplot(df.Age,ax=axes[0,0]);
sns.distplot(df.DailyRate,ax=axes[0,1]);
sns.distplot(df.TotalWorkingYears,ax=axes[0,2]);
sns.distplot(df.YearsSinceLastPromotion,ax=axes[0,3]);
sns.distplot(df.YearsAtCompany,ax=axes[1,0]);
sns.distplot(df.YearsSinceLastPromotion,ax=axes[1,1]);
sns.distplot(df.YearsWithCurrManager,ax=axes[1,2]);
sns.distplot(df.StandardHours,ax=axes[1,3]);
In [30]:
sns.set_style("darkgrid")
f, axes = plt.subplots(2,3,figsize=(30,15))
sns.countplot(df.BusinessTravel,ax=axes[0,0]);
sns.countplot(df.Department,ax=axes[0,1]);
sns.countplot(df.EducationField,ax=axes[0,2]);
sns.countplot(df.Gender,ax=axes[1,0]);
chart = sns.countplot(df.JobRole,ax=axes[1,1]);
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
sns.countplot(df.MaritalStatus,ax=axes[1,2]);

Multivariate Analysis

In [31]:
sns.scatterplot( x= "Age", y = "DailyRate" ,data=df, hue = 'Attrition')
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f94be11bd10>
In [32]:
sns.pairplot(df, hue='Attrition');

EmployeeNumber and Standard Hours can be dropped

In [33]:
df.drop(columns = ['EmployeeNumber'],inplace=True)
In [34]:
df.drop(columns = ['StandardHours'],inplace=True)
In [35]:
plt.figure(figsize = (20,15))
sns.heatmap(df.corr(),annot=True,cmap='Blues')
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f94bce33410>
In [36]:
sns.scatterplot( x= "JobLevel", y = "MonthlyIncome",data=df)
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f948aac2510>

Dropping correlated column having correlation cofficient more than 0.7

In [37]:
# Create correlation matrix
def drop_corr(df):
    corr_matrix = df.corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

    # Find features with correlation greater than 0.7
    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]

    # Drop features 
    df1 = df.drop(to_drop, axis=1)
    return df1
In [38]:
df = drop_corr(df)
In [39]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2940 entries, 0 to 2939
Data columns (total 27 columns):
Attrition                   2940 non-null object
Age                         2940 non-null int64
BusinessTravel              2940 non-null object
DailyRate                   2940 non-null int64
Department                  2940 non-null object
DistanceFromHome            2940 non-null int64
Education                   2940 non-null int64
EducationField              2940 non-null object
EnvironmentSatisfaction     2940 non-null int64
Gender                      2940 non-null object
HourlyRate                  2940 non-null int64
JobInvolvement              2940 non-null int64
JobLevel                    2940 non-null int64
JobRole                     2940 non-null object
JobSatisfaction             2940 non-null int64
MaritalStatus               2940 non-null object
MonthlyRate                 2940 non-null int64
NumCompaniesWorked          2940 non-null int64
Over18                      2940 non-null object
OverTime                    2940 non-null object
PercentSalaryHike           2940 non-null int64
RelationshipSatisfaction    2940 non-null int64
StockOptionLevel            2940 non-null int64
TrainingTimesLastYear       2940 non-null int64
WorkLifeBalance             2940 non-null int64
YearsAtCompany              2940 non-null int64
YearsSinceLastPromotion     2940 non-null int64
dtypes: int64(18), object(9)
memory usage: 620.3+ KB
In [40]:
categorical_features = list(df.select_dtypes(include=['O']).columns)
categorical_features
Out[40]:
['Attrition',
 'BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'Over18',
 'OverTime']
In [41]:
## converting categorical feature to 'Category type'
for column in categorical_features:
    df[column] = df[column].astype('category')
In [42]:
# select the categorical columns
category_columns = df.select_dtypes(['category']).columns
In [43]:
# convert each columns to code
df[category_columns] = df[category_columns].apply(lambda x: x.cat.codes)
In [44]:
df.head()
Out[44]:
Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike RelationshipSatisfaction StockOptionLevel TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsSinceLastPromotion
0 1 41 2 1102 2 1 2 1 2 0 94 3 2 7 4 2 19479 8 0 1 11 1 0 0 1 6 0
1 0 49 1 279 1 8 1 1 3 1 61 2 2 6 2 1 24907 1 0 0 23 4 1 3 3 10 1
2 1 37 2 1373 1 2 2 4 4 1 92 2 1 2 3 2 2396 6 0 1 15 2 0 3 3 0 0
3 0 33 1 1392 1 3 4 1 4 0 56 3 1 6 3 1 23159 1 0 1 11 3 0 3 3 8 3
4 0 27 2 591 1 2 1 3 1 1 40 3 1 2 2 1 16632 9 0 0 12 4 1 3 3 2 2
In [45]:
y = df['Attrition']
X = df.drop(['Attrition'],axis=1)
In [46]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
In [47]:
cat_list = category_columns.to_list()
cat_list.remove('Attrition')
In [48]:
cat_list
Out[48]:
['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'Over18',
 'OverTime']

Splitting Data set b/w Train/Test

In [49]:
# numerical_columns = X.drop(cat_list,axis=1).columns.tolist()

# X[numerical_columns] = scaler.fit_transform(X[numerical_columns])
In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
In [51]:
from sklearn.model_selection import cross_val_score,KFold
k =  KFold(random_state=7,n_splits=5)
In [94]:
from sklearn.pipeline import Pipeline
model= []
tr = []
te = []
f1 = []
auc = []

Model Building, as its classificatin problme we will be using below Classification Algorithms:

Logistici Regression

Decesion Tree

Bagging Classifier

AdaBoost

Gradient Boost

In [95]:
from sklearn.metrics import f1_score, roc_auc_score
In [96]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
    ('scaler',StandardScaler()),
    ('clf', LogisticRegression())
])

pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)


model.append('Logistic Regression')
tr.append(pipeline.score(X_train,y_train))
te.append(cross_val_score(pipeline, X_test, y_test, cv=k).mean())
f1.append(f1_score(y_test,y_pred))
auc.append(roc_auc_score(y_test,y_pred))
In [97]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
pipeline = Pipeline([
    ('scaler',StandardScaler()),
    ('clf', DecisionTreeClassifier(random_state=7))
])

pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)


model.append('Decision Tree')
tr.append(pipeline.score(X_train,y_train))
te.append(cross_val_score(pipeline, X_test, y_test, cv=k).mean())
f1.append(f1_score(y_test,y_pred))
auc.append(roc_auc_score(y_test,y_pred))
In [98]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline([
    ('scaler',StandardScaler()),
    ('clf', RandomForestClassifier(random_state=7))
])

pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)


model.append('Random Forest')
tr.append(pipeline.score(X_train,y_train))
te.append(cross_val_score(pipeline, X_test, y_test, cv=k).mean())
f1.append(f1_score(y_test,y_pred))
auc.append(roc_auc_score(y_test,y_pred))
In [99]:
# Bagging
from sklearn.ensemble import BaggingClassifier
pipeline = Pipeline([
    ('scaler',StandardScaler()),
    ('clf', BaggingClassifier(random_state=7))
])

pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)


model.append('Bagging')
tr.append(pipeline.score(X_train,y_train))
te.append(cross_val_score(pipeline, X_test, y_test, cv=k).mean())
f1.append(f1_score(y_test,y_pred))
auc.append(roc_auc_score(y_test,y_pred))
In [100]:
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
pipeline = Pipeline([
    ('scaler',StandardScaler()),
    ('reg', AdaBoostClassifier(random_state=7))
])

pipeline.fit(X_train,y_train)


model.append('AdaBoost')
tr.append(pipeline.score(X_train,y_train))
te.append(cross_val_score(pipeline, X_test, y_test, cv=k).mean())
f1.append(f1_score(y_test,y_pred))
auc.append(roc_auc_score(y_test,y_pred))
In [101]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
pipeline = Pipeline([
    ('scaler',StandardScaler()),
    ('clf', GradientBoostingClassifier(random_state=7))
])

pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)


model.append('Gradient Boosting')
tr.append(pipeline.score(X_train,y_train))
te.append(cross_val_score(pipeline, X_test, y_test, cv=k).mean())
f1.append(f1_score(y_test,y_pred))
auc.append(roc_auc_score(y_test,y_pred))
In [ ]:
 
In [102]:
# DataFrame to compare results.

results = pd.DataFrame()
results['Model'] = model
results['Training Score'] = tr
results['Testing Score'] = te
results['F1 Score'] = f1
results['ROC/AUC Score'] = auc
results = results.set_index('Model')
results.sort_values('Testing Score', ascending=False)
Out[102]:
Training Score Testing Score F1 Score ROC/AUC Score
Model
Random Forest 1.000000 0.867434 0.912088 0.919192
Gradient Boosting 0.942602 0.865711 0.649007 0.744407
Logistic Regression 0.870748 0.862350 0.480000 0.666481
Bagging 0.994473 0.852093 0.832370 0.861591
AdaBoost 0.893707 0.848761 0.832370 0.861591
Decision Tree 1.000000 0.796002 0.861386 0.923034
In [120]:
ss = StandardScaler()
X_train = pd.DataFrame(ss.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(ss.transform(X_test),columns=X_test.columns)

Hyperparameter tunning using GridSearchCV

In [121]:
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(random_state=7)

params = {
    'bootstrap': [True,False],
    'max_depth': [3,4,None],
    'max_features': ['sqrt','log2'],
    'min_samples_leaf': [1,3, 4],
    'min_samples_split': [2,3,5],
    'n_estimators': [25,50,'warn']
}    

grid = GridSearchCV(estimator = rf, param_grid = params, cv = k)

grid.fit(X_train,y_train)
Out[121]:
GridSearchCV(cv=KFold(n_splits=5, random_state=7, shuffle=False),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fr...
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=7,
                                              verbose=0, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'bootstrap': [True, False], 'max_depth': [3, 4, None],
                         'max_features': ['sqrt', 'log2'],
                         'min_samples_leaf': [1, 3, 4],
                         'min_samples_split': [2, 3, 5],
                         'n_estimators': [25, 50, 'warn']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
In [122]:
model = ['Random forest after Grid search']
y_pred = grid.predict(X_test)
tr = [grid.score(X_train,y_train)]
te = [grid.score(X_test,y_test)]
f1 = [f1_score(y_test,y_pred)]
auc = [roc_auc_score(y_test,y_pred)]
In [123]:
gb = GradientBoostingClassifier(random_state=7)
gb.fit(X_train,y_train)
Out[123]:
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=7, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

Hyperparameter tunning using RandomizedSearchCV

In [124]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50 , stop = 150, num = 25)]   # returns evenly spaced 25 numbers
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 10, num = 5)]  # returns evenly spaced numbers can be changed to any
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2,3,4,5,6,7,8,9,10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4]
# Method of selecting samples for training each tree
learning_rate = [float(x) for x in np.linspace(0.1, 1, num = 10)]

# Create the random grid
params_r = {'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'learning_rate':learning_rate}

# Use the random grid to search for best hyperparameters

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
random = RandomizedSearchCV(estimator=gb, param_distributions=params_r,cv = k, random_state=7)

# Fit the random search model
random.fit(X_train, y_train)
Out[124]:
RandomizedSearchCV(cv=KFold(n_splits=5, random_state=7, shuffle=False),
                   error_score=nan,
                   estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                        criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_we...
                                                          0.7000000000000001,
                                                          0.8, 0.9, 1.0],
                                        'max_depth': [5, 6, 7, 8, 10, None],
                                        'max_features': ['log2', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4],
                                        'min_samples_split': [2, 3, 4, 5, 6, 7,
                                                              8, 9, 10],
                                        'n_estimators': [50, 54, 58, 62, 66, 70,
                                                         75, 79, 83, 87, 91, 95,
                                                         100, 104, 108, 112,
                                                         116, 120, 125, 129,
                                                         133, 137, 141, 145,
                                                         150]},
                   pre_dispatch='2*n_jobs', random_state=7, refit=True,
                   return_train_score=False, scoring=None, verbose=0)
In [125]:
random.best_params_
Out[125]:
{'n_estimators': 108,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 7,
 'learning_rate': 0.6}
In [126]:
model.append('Gradient Boosting after RandomCV')
y_pred = random.predict(X_test)
tr.append(random.score(X_train,y_train))
te.append(random.score(X_test,y_test))
f1.append(f1_score(y_test,y_pred))
auc.append(roc_auc_score(y_test,y_pred))
In [127]:
model,tr,te,f1,auc
Out[127]:
(['Random forest after Grid search', 'Gradient Boosting after RandomCV'],
 [1.0, 1.0],
 [0.9761904761904762, 0.9863945578231292],
 [0.9239130434782609, 0.9587628865979383],
 [0.9292929292929293, 0.9676519799219186])

DataFrame to compare results.

  • Gradient Boosting gives best Testing Score after Hyper parameter tuning
  • Both Random Forecast and GB is overfitting but Testing Accuracy is good so model is acceptable
In [128]:
results = pd.DataFrame()
results['Model'] = model
results['Training Score'] = tr
results['Testing Score'] = te
results['F1 Score'] = f1
results['ROC/AUC Score'] = auc
results = results.set_index('Model').sort_values('Testing Score',ascending=False)
results
Out[128]:
Training Score Testing Score F1 Score ROC/AUC Score
Model
Gradient Boosting after RandomCV 1.0 0.986395 0.958763 0.967652
Random forest after Grid search 1.0 0.976190 0.923913 0.929293
In [ ]:
 
In [ ]: